Analysis: Clustifyr example

an example showing the use of clustifyr to identify cell types

Author
Affiliation

BBSR Bioinformatics

1 Notes

Here we will load the 10X Genomics PBMC dataset, process it in the standard Seurat tutorial method and then demonstrate the use clustifyr to identify cell types.

The clustifyr tutorial can be found here: https://www.bioconductor.org/packages/release/bioc/vignettes/clustifyr/inst/doc/clustifyr.html

1.1 Samples

Dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found here: https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz

2 Environment

knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align='center')
library(dplyr)
library(tidyr)
library(purrr)
library(Seurat)
library(patchwork)
library(clustifyr)
library(ComplexHeatmap)
library(DT)

set.seed(87645893)

3 Load data

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
so <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
so
An object of class Seurat 
13714 features across 2700 samples within 1 assay 
Active assay: RNA (13714 features, 0 variable features)
 1 layer present: counts

4 Preprocess data

so[["percent.mt"]] <- PercentageFeatureSet(so, pattern = "^MT-")
VlnPlot(so, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(so, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(so, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# Filter cells based on nCount_RNA, nFeature_RNA, and percent.mt
so <- subset(so, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

5 Normalize data

# Normalize the data
so <- NormalizeData(so, normalization.method = "LogNormalize", scale.factor = 10000)

6 Identify variable features

# Identify the 2,000 most highly variable features
so <- FindVariableFeatures(so, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(so), 10)
plot1 <- VariableFeaturePlot(so)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

7 Scale data

# Scale the data
so <- ScaleData(so, features = rownames(so))

8 Perform PCA

# Perform PCA
so <- RunPCA(so, features = VariableFeatures(object = so))
print(so[["pca"]], dims = 1:2, ncol = 2)
PC_ 1 
Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
       FCER1G, CFD, LGALS1, S100A8, CTSS, LGALS2, SERPINA1, IFITM3, SPI1, CFP 
Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CD27, STK17A, CTSW 
       CD247, GIMAP5, AQP3, CCL5, SELL, TRAF3IP3, GZMA, MAL, CST7, ITM2A 
PC_ 2 
Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
       HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
Negative:  NKG7, PRF1, CST7, GZMB, GZMA, FGFBP2, CTSW, GNLY, B2M, SPON2 
       CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
VizDimLoadings(so, dims = 1:2, reduction = "pca")

DimPlot(so, reduction = "pca")

DimHeatmap(so, dims = 1:15, cells = 500, balanced = TRUE)

9 Determine the dimensionality of the data

ElbowPlot(so)

10 Cluster the cells

# Cluster the cells
so <- FindNeighbors(so, dims = 1:10)
so <- FindClusters(so, resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 95927

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8728
Number of communities: 9
Elapsed time: 0 seconds
so <- RunUMAP(so, dims = 1:10)

DimPlot(so, reduction = "umap", label = TRUE)

11 Clustifyr: Using a gene marker list

This examples uses the PanglaoDB marker gene list to identify cell types in the PBMC dataset.

11.1 Load marker gene list

# load panglaodb markers tsv
markers <- read.csv("../marker-gene-lists/PanglaoDB_markers_27_Mar_2020.tsv", sep = "\t")

datatable(markers, options = list(pageLength = 5))

11.2 Filter for human genes

#filter for human genes; species column contains Hs for human
human_markers <- markers[grepl("Hs", markers$species),]

11.3 Convert to dataframe with columns as cell type and rows as gene names

This is the format that clustifyr expects the marker gene list to be in. NAs should pad where there are fewer genes for a given cell type.

# convert to dataframe with columns as cell type and rows as gene names
gene_list <- human_markers %>%
  select(cell.type, official.gene.symbol) %>%
  distinct() %>%
  group_split(cell.type) %>%
  setNames(map_chr(., ~ .x$cell.type[1])) %>%
  map(~ .x$official.gene.symbol)

# Pad to equal length and bind as dataframe
max_len <- max(lengths(gene_list))
cell_type_markers <- gene_list %>%
  map(~ c(.x, rep(NA, max_len - length(.x)))) %>%
  bind_cols()

datatable(cell_type_markers, options = list(pageLength = 10))

11.4 Optionally filter for cell types of interest

# Optionally filter column names for cell types of interest
cell_type_markers <- cell_type_markers %>%
  select("B cell", "T cell", "NK cell", "Monocyte", "Macrophage", "Dendritic cell", "Mast cell", "Neutrophil", "Eosinophil", "Basophil", "Fibroblast", "Endothelial")

11.5 Run clustify_lists() to generate correlation matrix

# Available metrics include: "hyper", "jaccard", "spearman", "gsea"
list_res <- clustify_lists(
    input = so, # matrix of normalized single-cell RNA-seq counts
    cluster_col = "RNA_snn_res.0.5", # name of column in meta.data containing cell clusters
    marker = cell_type_markers, # list of known marker genes
    marker_inmatrix = FALSE,
    metric = "jaccard", # test to use for assigning cell types
    obj_out = FALSE # return Seurat object
)

11.6 Plot heatmap of correlation matrix

plot_cor_heatmap(
    cor_mat = list_res, # matrix of correlation coefficients from clustify_lists()
    cluster_rows = TRUE, # cluster by row
    cluster_columns = TRUE, # cluster by column
    legend_title = "jaccard" # title of heatmap legend
)

datatable(list_res, options = list(pageLength = 5))

11.7 Assign cell types to Seurat object using clustify_lists()

so_res <- clustify_lists(
    input = so, # matrix of normalized single-cell RNA-seq counts
    cluster_col = "RNA_snn_res.0.5", # name of column in meta.data containing cell clusters
    marker = cell_type_markers, # list of known marker genes
    marker_inmatrix = FALSE,
    metric = "jaccard", # test to use for assigning cell types
    obj_out = TRUE # return Seurat object
)

# clustifyr stores the cell type assignments in the metadata column "type"
so_res <- SetIdent(so_res, value = "type")

DimPlot(so_res, reduction="umap", label = TRUE, , pt.size = 0.2) + NoLegend()

12 Clustifyr: Using an existing annotated Seurat object

12.1 Load reference Seurat object

# Load the Seurat object
so_ref <- readRDS("so_ref.rds")
so_ref
An object of class Seurat 
13714 features across 2638 samples within 1 assay 
Active assay: RNA (13714 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 2 dimensional reductions calculated: pca, umap
datatable(so_ref@meta.data)

12.2 Build reference matrix from previously annotated Seurat object

ref_mat <- seurat_ref(
  seurat_object = so_ref,
  cluster_col = "cell_types"
)

12.3 Run clustify directly on your unannotated Seurat object

so_res2 <- clustify(
  input = so,                      # unannotated Seurat object
  ref_mat = ref_mat,               # matrix of average expression per reference cell type
  cluster_col = "seurat_clusters", # or whatever your cluster column is
  obj_out = FALSE                   # return Seurat object with metadata updated
)

plot_cor_heatmap(
  cor_mat = so_res2,
  cluster_rows = TRUE,
  cluster_columns = TRUE)

so_res2 <- clustify(
  input = so,                      # unannotated Seurat object
  ref_mat = ref_mat,               # matrix of average expression per reference cell type
  cluster_col = "seurat_clusters", # or whatever your cluster column is
  obj_out = TRUE                   # return Seurat object with metadata updated
)

# clustifyr stores the cell type assignments in the metadata column "type"
so_res2 <- SetIdent(so_res2, value = "type")
DimPlot(so_res2, reduction="umap", label = TRUE, pt.size = 0.2) + NoLegend()

13 Session info

sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.3.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Denver
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] DT_0.33               ComplexHeatmap_2.22.0 clustifyr_1.18.0     
 [4] patchwork_1.3.0       Seurat_5.2.1          SeuratObject_5.0.2   
 [7] sp_2.2-0              purrr_1.0.4           tidyr_1.3.1          
[10] dplyr_1.1.4          

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.22            splines_4.4.2              
  [3] later_1.4.1                 tibble_3.2.1               
  [5] polyclip_1.10-7             fastDummies_1.7.5          
  [7] lifecycle_1.0.4             doParallel_1.0.17          
  [9] globals_0.16.3              lattice_0.22-6             
 [11] MASS_7.3-61                 crosstalk_1.2.1            
 [13] magrittr_2.0.3              sass_0.4.9                 
 [15] plotly_4.10.4               rmarkdown_2.29             
 [17] jquerylib_0.1.4             yaml_2.3.10                
 [19] httpuv_1.6.15               sctransform_0.4.1          
 [21] spam_2.11-1                 spatstat.sparse_3.1-0      
 [23] reticulate_1.41.0.1         cowplot_1.1.3              
 [25] pbapply_1.7-2               RColorBrewer_1.1-3         
 [27] abind_1.4-8                 zlibbioc_1.52.0            
 [29] Rtsne_0.17                  GenomicRanges_1.58.0       
 [31] BiocGenerics_0.52.0         circlize_0.4.16            
 [33] GenomeInfoDbData_1.2.13     IRanges_2.40.1             
 [35] S4Vectors_0.44.0            ggrepel_0.9.6              
 [37] irlba_2.3.5.1               listenv_0.9.1              
 [39] spatstat.utils_3.1-3        goftest_1.2-3              
 [41] RSpectra_0.16-2             spatstat.random_3.3-3      
 [43] fitdistrplus_1.2-2          parallelly_1.43.0          
 [45] codetools_0.2-20            DelayedArray_0.32.0        
 [47] tidyselect_1.2.1            shape_1.4.6.1              
 [49] UCSC.utils_1.2.0            farver_2.1.2               
 [51] matrixStats_1.5.0           stats4_4.4.2               
 [53] spatstat.explore_3.4-2      jsonlite_1.9.1             
 [55] GetoptLong_1.0.5            progressr_0.15.1           
 [57] ggridges_0.5.6              survival_3.7-0             
 [59] iterators_1.0.14            foreach_1.5.2              
 [61] tools_4.4.2                 ica_1.0-3                  
 [63] Rcpp_1.0.14                 glue_1.8.0                 
 [65] gridExtra_2.3               SparseArray_1.6.2          
 [67] xfun_0.51                   MatrixGenerics_1.18.1      
 [69] GenomeInfoDb_1.42.3         withr_3.0.2                
 [71] BiocManager_1.30.25         fastmap_1.2.0              
 [73] entropy_1.3.1               digest_0.6.37              
 [75] R6_2.6.1                    mime_0.13                  
 [77] colorspace_2.1-1            Cairo_1.6-2                
 [79] scattermore_1.2             tensor_1.5                 
 [81] spatstat.data_3.1-6         generics_0.1.3             
 [83] renv_1.1.1                  data.table_1.17.0          
 [85] httr_1.4.7                  htmlwidgets_1.6.4          
 [87] S4Arrays_1.6.0              uwot_0.2.3                 
 [89] pkgconfig_2.0.3             gtable_0.3.6               
 [91] lmtest_0.9-40               SingleCellExperiment_1.28.1
 [93] XVector_0.46.0              htmltools_0.5.8.1          
 [95] dotCall64_1.2               fgsea_1.32.2               
 [97] clue_0.3-66                 scales_1.3.0               
 [99] Biobase_2.66.0              png_0.1-8                  
[101] spatstat.univar_3.1-2       knitr_1.50                 
[103] rstudioapi_0.17.1           reshape2_1.4.4             
[105] rjson_0.2.23                nlme_3.1-166               
[107] cachem_1.1.0                zoo_1.8-13                 
[109] GlobalOptions_0.1.2         stringr_1.5.1              
[111] KernSmooth_2.23-24          parallel_4.4.2             
[113] miniUI_0.1.1.1              vipor_0.4.7                
[115] ggrastr_1.0.2               pillar_1.10.1              
[117] vctrs_0.6.5                 RANN_2.6.2                 
[119] promises_1.3.2              xtable_1.8-4               
[121] cluster_2.1.6               beeswarm_0.4.0             
[123] evaluate_1.0.3              cli_3.6.4                  
[125] compiler_4.4.2              rlang_1.1.5                
[127] crayon_1.5.3                future.apply_1.11.3        
[129] labeling_0.4.3              plyr_1.8.9                 
[131] ggbeeswarm_0.7.2            stringi_1.8.4              
[133] viridisLite_0.4.2           deldir_2.0-4               
[135] BiocParallel_1.40.0         munsell_0.5.1              
[137] lazyeval_0.2.2              spatstat.geom_3.3-6        
[139] Matrix_1.7-1                RcppHNSW_0.6.0             
[141] future_1.34.0               ggplot2_3.5.1              
[143] shiny_1.10.0                SummarizedExperiment_1.36.0
[145] ROCR_1.0-11                 igraph_2.1.4               
[147] bslib_0.9.0                 fastmatch_1.1-6